* PPD_checks_TFP_price_dispersion.SAS -- Do validity checks of the CART imputations for 2002 and 2007 FHS industries;
* using the within-industry TFP and price dispersion statistics from 500 pairs of datasets;


%include "ASMimplibs.sas";


** options obs=5000;run;

%macro chk(d,i,s,data);

*      d = dataset name (based on product/industry);
*      i = implicate number;
*      s = last period of good data;  * changed from s< to s<=;

      data phy&d (keep = survu_id year ppsr1 phyflag price pqs piship lnq iake iaks lk ial lth iam lm iae le exit02 _IMPUTE_);
       	set fhs.phy&d._&data;
	if phyflag=. then phyflag=0;
        if _IMPUTE_ = &i;
      run;

      proc sort data=phy&d; by year survu_id; run; 

      /** Use the final PPSR50 outlier-trimmed sample from the Bureau-completed data
          to select the same sample from the CART-completed datasets. **/
      /* Note: rejCB&d should be empty. */
      data phy&d rejCB&d;
       merge phy&d (in=inCART) CB&d (in=inCB);
       by year survu_id;
       if inCART and inCB then output phy&d;
       else if inCB then output rejCB&d;
      run;

      proc sort data=phy&d; by survu_id year; run; 

      proc summary data=phy&d nway;
        class survu_id;
        var phyflag;
       output out=avgphy mean=aphyflag;
      run;

      data phy&d;
       merge phy&d avgphy ;
       by survu_id;
      run;

      data all;
       set phy&d;
       mer=1;
       if year<=&s;
      run;

       * 0) CREATE OTHER MEASURES OF PRODUCTIVITY;

       %macro make;

            * starting by creating pbard;
            * weighted geometric mean of price;
            * weights are now nq2 were pqs;
            data all;
             set all;
               lpricex = log(price);  
               nq2 = pqs*price;
               rnq2 = nq2/piship;  
            run;

            proc summary data=all nway;
              class year;
              var lpricex;             * recall lpricex=log(price);
              weight nq2;              * recall nq2=pqs*price;
              output out=avgprice mean=lpbar;
            run;

            data avgprice;
             set avgprice;
             pbar=exp(lpbar);
             mer=1;
             run;

            data p2002 (keep=norm mer);
             set avgprice;
             if year=2002;
             norm=pbar;
             mer=1;
             run;

            data avgprice (keep=year pbard);
             merge avgprice p2002;
             by mer;
             pbard=pbar/norm;
            run;

            proc sort data=all;by year;run;
            proc sort data=avgprice; by year; run;

           data all;
             merge all avgprice;
           by year;
	   phyq=pqs/ppsr1; 
           /* q2=nq2/pbard;    * nq1=price*phyq; */
       	   if phyq   > 0  then lqphy = log(phyq);  
           ltfp = lnq - (iake+iaks)*lk - ial*lth - iam*lm - iae*le; 
           ltfpphy =  lqphy -(iake+iaks)*lk - ial*lth -iam*lm -iae*le ; 

           if price>0 and pbard>0 then lprice2=log(price/pbard); * relative price;
           counter=1;
           run;

       %mend;
    
       %make;

     proc sort data=all;by year _impute_;
 
* 6) AND THE FINAL DATA IS;

     data &d.&i._&data;
      set all;
     run;


    proc means data=all p10 q1 q3 p90 NOPRINT;
     var ltfp ltfpphy lprice2;
     by year _impute_ ;
      output out=&d._dispersion (keep = year _impute_ 
                                             ltfp_p10 ltfp_q1 ltfp_q3 ltfp_p90 ltfp_sd
                                             ltfpphy_p10 ltfpphy_q1 ltfpphy_q3 ltfpphy_p90 ltfpphy_sd
                                             lprice2_p10 lprice2_q1 lprice2_q3 lprice2_p90 lprice2_sd) 
      p10(ltfp)=ltfp_p10 q1(ltfp)=ltfp_q1 q3(ltfp)=ltfp_q3 p90(ltfp)=ltfp_p90 stddev(ltfp)=ltfp_sd
      p10(ltfpphy)=ltfpphy_p10 q1(ltfpphy)=ltfpphy_q1 q3(ltfpphy)=ltfpphy_q3 p90(ltfpphy)=ltfpphy_p90 stddev(ltfpphy)=ltfpphy_sd
      p10(lprice2)=lprice2_p10 q1(lprice2)=lprice2_q1 q3(lprice2)=lprice2_q3 p90(lprice2)=lprice2_p90 stddev(lprice2)=lprice2_sd;
    run;

    data &d._dispersion&i;
     set &d._dispersion;

     ltfp_iqr = ltfp_q3 - ltfp_q1;
     ltfp_90_10 = ltfp_p90 - ltfp_p10;
     tfp_75_25_ratio = exp(ltfp_iqr);
     tfp_90_10_ratio = exp(ltfp_90_10);

     ltfpphy_iqr = ltfpphy_q3 - ltfpphy_q1;
     ltfpphy_90_10 = ltfpphy_p90 - ltfpphy_p10;
     tfpphy_75_25_ratio = exp(ltfpphy_iqr);
     tfpphy_90_10_ratio = exp(ltfpphy_90_10);

     lprice2_iqr = lprice2_q3 - lprice2_q1;
     lprice2_90_10 = lprice2_p90 - lprice2_p10;
     price2_75_25_ratio = exp(lprice2_iqr);
     price2_90_10_ratio = exp(lprice2_90_10);

    run;

 %mend;


%MACRO implicates_loop(M,data);

 %do i=1 %to &M;

   %chk(gasf,&i,2007,&data);
   %chk(boxesf,&i,2002,&data);
   %chk(breadf,&i,2002,&data); 
   %chk(icef,&i,2007,&data);  
   %chk(sugarf,&i,2002,&data);
   %chk(coffeef,&i,2002,&data); 
   %chk(plywoodf,&i,2002,&data);
   %chk(floorf,&i,2002,&data);   
   %chk(carbonf,&i,2002,&data);  


   data gasf_dispersion&i; set gasf_dispersion&i; industry = "gas    "; run;
   data sugarf_dispersion&i; set sugarf_dispersion&i; industry = "sugar  "; run;
   data icef_dispersion&i; set icef_dispersion&i; industry = "ice    "; run;
   data breadf_dispersion&i; set breadf_dispersion&i; industry = "bread  "; run;
   data coffeef_dispersion&i; set coffeef_dispersion&i; industry = "coffee "; run;
   data plywoodf_dispersion&i; set plywoodf_dispersion&i; industry = "plywood"; run;
   data floorf_dispersion&i; set floorf_dispersion&i; industry = "floor  "; run;
   data carbonf_dispersion&i; set carbonf_dispersion&i; industry = "carbon "; run;
   data boxesf_dispersion&i; set boxesf_dispersion&i; industry = "boxes  "; run;


   /* Stack the industry TFP dispersion estimates. */

   data all_dispersion&i;
    set  gasf_dispersion&i
        sugarf_dispersion&i
        icef_dispersion&i
        breadf_dispersion&i
        coffeef_dispersion&i
        plywoodf_dispersion&i
        floorf_dispersion&i
        carbonf_dispersion&i
        boxesf_dispersion&i   
        ;
   run;

  
 %end;  /* End of do loop that constructs separate dataset from each implicate.*/

 /* Stack the estimates from each implicate. */
 data all_dispersion_&data;
  set all_dispersion1;
 run; 

 %do i=2 %to &M;
    proc append base=all_dispersion_&data data=all_dispersion&i;
    run; 
 %end; 

%MEND; /* End of implicates_loop macro */

/* Get final PPSR50 outlier-trimmed sample from Bureau-completed data. 
   We will use these to select the same samples from the CART-completed datasets. */

data CBgasf     (keep = survu_id  year) ; set ppsr50.allgasf; run;
data CBboxesf   (keep = survu_id  year) ; set ppsr50.allboxesf; if year=2002; run;
data CBicef     (keep = survu_id  year) ; set ppsr50.allicef; run;
data CBbreadf   (keep = survu_id  year) ; set ppsr50.allbreadf; if year=2002; run;
data CBsugarf   (keep = survu_id  year) ; set ppsr50.allsugarf; if year=2002; run;
data CBcoffeef  (keep = survu_id  year) ; set ppsr50.allcoffeef; if year=2002; run;
data CBplywoodf (keep = survu_id  year) ; set ppsr50.allplywoodf; if year=2002; run;
data CBfloorf   (keep = survu_id  year) ; set ppsr50.allfloorf; if year=2002; run;
data CBcarbonf  (keep = survu_id  year) ; set ppsr50.allcarbonf; if year=2002; run;


proc sort data=CBgasf ; by year survu_id;run;

proc sort data=CBboxesf ; by year survu_id;run;
proc sort data=CBicef ; by year survu_id;run;
proc sort data=CBbreadf ; by year survu_id;run;
proc sort data=CBsugarf ; by year survu_id;run;
proc sort data=CBcoffeef ; by year survu_id;run;
proc sort data=CBplywoodf ; by year survu_id;run;
proc sort data=CBfloorf ; by year survu_id;run;
proc sort data=CBcarbonf ; by year survu_id;run;


%implicates_loop(500,imputes);
%implicates_loop(500,predicted);


%MACRO stack_ppsr50_datasets(d,M);

  data ppsr50.&d._imputes_check;
   set &d.1_imputes;
  run; 

  data ppsr50.&d._predicted_check;
   set &d.1_predicted;
  run; 


  %do i=2 %to &M;
     proc append base=ppsr50.&d._imputes_check data=&d.&i._imputes;
     run; 
     proc append base=ppsr50.&d._predicted_check data=&d.&i._predicted;
     run; 
  %end; 

%MEND;

%stack_ppsr50_datasets(sugarf,500);
%stack_ppsr50_datasets(boxesf,500);
%stack_ppsr50_datasets(breadf,500); 
%stack_ppsr50_datasets(gasf,500);
%stack_ppsr50_datasets(icef,500);  
%stack_ppsr50_datasets(coffeef,500); 
%stack_ppsr50_datasets(plywoodf,500);
%stack_ppsr50_datasets(floorf,500);   
%stack_ppsr50_datasets(carbonf,500);



%MACRO print_dispersion_stats(data);


proc sort data=all_dispersion_&data;
 by industry year _impute_;
run;


proc means data= all_dispersion_&data mean stddev noprint;
 var tfp_75_25_ratio tfp_90_10_ratio;
by industry year ;
 output out=tfp_disp (keep = industry year tfp_75_25_ratio_m tfp_90_10_ratio_m
 tfp_75_25_ratio_sd tfp_90_10_ratio_sd)
 mean(tfp_75_25_ratio)=tfp_75_25_ratio_m mean(tfp_90_10_ratio)=tfp_90_10_ratio_m
 stddev(tfp_75_25_ratio)=tfp_75_25_ratio_sd stddev(tfp_90_10_ratio)=tfp_90_10_ratio_sd;
run;

proc print data=tfp_disp;
 var industry year tfp_75_25_ratio_m tfp_90_10_ratio_m tfp_75_25_ratio_sd tfp_90_10_ratio_sd;
 title1 "Traditional TFP Dispersion measures by industry and year";
 title2 "CART &data data, 500-implicate mean and s.d.";
run;


proc means data= all_dispersion_&data mean stddev noprint;
 var tfpphy_75_25_ratio tfpphy_90_10_ratio;
by industry year;
 output out=tfpphy_disp (keep = industry year tfpphy_75_25_ratio_m tfpphy_90_10_ratio_m
 tfpphy_75_25_ratio_sd tfpphy_90_10_ratio_sd)
 mean(tfpphy_75_25_ratio)=tfpphy_75_25_ratio_m mean(tfpphy_90_10_ratio)=tfpphy_90_10_ratio_m
 stddev(tfpphy_75_25_ratio)=tfpphy_75_25_ratio_sd stddev(tfpphy_90_10_ratio)=tfpphy_90_10_ratio_sd;
run;


proc print data=tfpphy_disp;
 var industry year tfpphy_75_25_ratio_m tfpphy_90_10_ratio_m tfpphy_75_25_ratio_sd tfpphy_90_10_ratio_sd;
 title1 "Physical TFP Dispersion measures by industry and year";
 title2 "CART &data data, 500-implicate mean and s.d.";
run;



proc means data= all_dispersion_&data mean stddev noprint;
 var price2_75_25_ratio price2_90_10_ratio;
by industry year;
 output out=lprice2_disp (keep = industry year price2_75_25_ratio_m price2_90_10_ratio_m
 price2_75_25_ratio_sd price2_90_10_ratio_sd)
 mean(price2_75_25_ratio)=price2_75_25_ratio_m mean(price2_90_10_ratio)=price2_90_10_ratio_m
 stddev(price2_75_25_ratio)=price2_75_25_ratio_sd stddev(price2_90_10_ratio)=price2_90_10_ratio_sd;
run;


proc print data=lprice2_disp;
 var industry year price2_75_25_ratio_m price2_90_10_ratio_m price2_75_25_ratio_sd price2_90_10_ratio_sd;
 title1 "Price Dispersion measures by industry and year";
 title2 "CART &data  data, 500-implicate mean and s.d.";
run;

%MEND; 

%print_dispersion_stats(imputes);
%print_dispersion_stats(predicted);

/** Now compute validity checks based on dispersion statistics. */

proc datasets library=work;
 modify all_dispersion_imputes;
  rename tfp_75_25_ratio=tfp7525_imp; 
  rename tfp_90_10_ratio=tfp9010_imp;
  rename tfpphy_75_25_ratio=tfpphy7525_imp ;
  rename tfpphy_90_10_ratio=tfpphy9010_imp ;
  rename price2_75_25_ratio=price7525_imp ;
  rename price2_90_10_ratio=price9010_imp ;
 modify all_dispersion_predicted;
  rename tfp_75_25_ratio=tfp7525_pred; 
  rename tfp_90_10_ratio=tfp9010_pred;
  rename tfpphy_75_25_ratio=tfpphy7525_pred ;
  rename tfpphy_90_10_ratio=tfpphy9010_pred ;
  rename price2_75_25_ratio=price7525_pred ;
  rename price2_90_10_ratio=price9010_pred ;
run;
 
proc sort data=all_dispersion_imputes; by industry year _impute_; run;
proc sort data=all_dispersion_predicted; by industry year _impute_; run;

data all_dispersion;
 merge all_dispersion_imputes all_dispersion_predicted;
by industry year _impute_;
  if tfp7525_imp > tfp7525_pred then tfp7525_f =1; else tfp7525_f = 0;
  if tfp9010_imp > tfp9010_pred then tfp9010_f =1; else tfp9010_f = 0;
  if tfpphy7525_imp > tfpphy7525_pred then tfpphy7525_f =1; else tfpphy7525_f = 0;
  if tfpphy9010_imp > tfpphy9010_pred then tfpphy9010_f =1; else tfpphy9010_f = 0;
  if price7525_imp > price7525_pred then price7525_f =1; else price7525_f = 0;
  if price9010_imp > price9010_pred then price9010_f =1; else price9010_f = 0;
run;

proc means data=all_dispersion noprint;
 var tfp7525_f tfp9010_f tfpphy7525_f tfpphy9010_f price7525_f price9010_f;
by industry year;
output out=flagmeans mean(tfp7525_f)=tfp7525_f_mean mean(tfp9010_f)=tfp9010_f_mean mean(tfpphy7525_f)=tfpphy7525_f_mean mean(tfpphy9010_f)=tfpphy9010_f_mean mean(price7525_f)=price7525_f_mean mean(price9010_f)=price9010_f_mean ;
run;

data flagmeans;
 set flagmeans;
  if tfp7525_f_mean<0.5 then tfp7525_p = 2*tfp7525_f_mean;
  else tfp7525_p = 2*(1-tfp7525_f_mean);
  if tfp9010_f_mean<0.5 then tfp9010_p = 2*tfp9010_f_mean;
  else tfp9010_p = 2*(1-tfp9010_f_mean);
  if tfpphy7525_f_mean<0.5 then tfpphy7525_p = 2*tfpphy7525_f_mean;
  else tfpphy7525_p = 2*(1-tfpphy7525_f_mean);
  if tfpphy9010_f_mean<0.5 then tfpphy9010_p = 2*tfpphy9010_f_mean;
  else tfpphy9010_p = 2*(1-tfpphy9010_f_mean);
  if price7525_f_mean<0.5 then price7525_p = 2*price7525_f_mean;
  else price7525_p = 2*(1-price7525_f_mean);
  if price9010_f_mean<0.5 then price9010_p = 2*price9010_f_mean;
  else price9010_p = 2*(1-price9010_f_mean);
run;

proc print data=flagmeans;
 var industry year tfp7525_p tfp9010_p tfpphy7525_p tfpphy9010_p price7525_p price9010_p;
title1 "Synthetic p-values for TFP, TFPphy and price dispersion";
title2 "CART-completed vs. CART-predicted";
run;
